####################This code generates figure 1 and runs the simulation with qog data####################
library(haven)

####################Set.seed####################
set.seed(213)

####################Set Working Directory to the Replication Folder####################

####################Read Cross-sectional Data####################
qogcs <- read_dta("qog_bas_cs_jan21.dta")
# Requires qog_bas_cs_jan21.dta available at: https://www.qogdata.pol.gu.se/data/qog_std_cs_jan21.dta (last accessed Jun 25, 2021)
# Teorell, Jan, Aksel Sundstrom, Saren Holmberg, Bo Rothstein, Natalia Alvarado Pachon & Cem Mert Dalli. 2021. The Quality of Government Standard Dataset, version Jan21. University of Gothenburg: The Quality of Government Institute, http://www.qog.pol.gu.se doi:10.18157/qogstdjan21

####################Drop country codes and version variables####################
qogcs_pure <- subset(qogcs,select = -c(ccode,cname,ccodealp,ccodecow,ccodewb,version))

####################Number of Vairables####################
K <- ncol(qogcs_pure)

#################### max number of covs to consider####################

J <- 100 

####################Number of Simulations####################
sims = 25000

####################Create results matrix to store sim results####################
results <- matrix(,nrow=sims, ncol=J)

####################Summary Statistics (Table 1: Left Column)####################
isnamatrix <- is.na(qogcs_pure)
prop.missing <- mean(colMeans(isnamatrix))
max.missing <- max(colMeans(isnamatrix))
min.missing <- min(colMeans(isnamatrix))
which(colMeans(isnamatrix)==min.missing)
####################6 Variables Fully Observed####################
table1_left <- matrix(, nrow=6, ncol=1)
table1_row_names <- c("Unit of Observations", "Number of Observations","Number of Variables","Proportion of Missing Values (Avg)","Proportion of Missing Values (Max)", "Number of Variables Fully Observed")
row.names(table1_left) <- table1_row_names
colnames(table1_left) <- "QoG"
qog_column <- c("Countries",nrow(qogcs_pure),ncol(qogcs_pure),prop.missing,max.missing,6)
table1_left[,1] <- qog_column
table1_left

####################Running Simulation####################
for(i in 1:sims){

##################### handle case of j = 1####################
	s <- sample.int(K,1,replace=FALSE)
    results[i,1] <- mean(isnamatrix[,s] == 0)
    
    results[i,2:J] <- sapply(2:J,function(j) mean(rowSums(isnamatrix[,sample.int(K,j,replace=FALSE)]) == 0))
    cat(i,"")
}


####################Output results####################
write.table(results,"SIM25000_qog.csv",row.names=FALSE, sep=",")


####################Input results#################### 
results <- read.csv("SIM25000_qog.csv")

####################Expected Proportion of Data Observed####################
expected <- colMeans(results)
J <- length(expected)
expected_table_qog <- cbind(c(1:J),c(expected)) 
colnames(expected_table_qog) <- c("Number of Variables","Expected Proportion of Data Observed")
expected_table_qog
####################Probability of All Rows Missing####################
prop <- colMeans(results==0)
J <- length(prop)
prop_table_qog <- cbind(c(1:J),prop)
colnames(prop_table_qog) <- c("Number of Variables","Probability All Rows Missing")
prop_table_qog
####################Generating Figure 1####################
#################Quartz Code is used to save the exact figure in the Mac OS environment, for windows and other environments ignore this command##################

quartz(width=8,height=4)

par(mfrow=c(1,2))

plot(y=c(1,expected),c(0:J),ylim=c(0,1),ylab="Expected Proportion Observed",xlab="Number of Variables",xlim=c(0,77),type="p", pch=20)

plot(y=c(0,prop),c(0:J),ylim=c(0,1),ylab="Probability All Rows Missing",xlab="Number of Variables",xlim=c(0,77),type="p", pch=20)

mtext("Missingness under Subsampling of Variables (QoG Data)", side=3,line=-3,outer=TRUE)



quartz.save("qog-figure.pdf", type = "pdf", device = dev.cur(), dpi = 600)


